PaCMAP Benchmark#

Load data#

Hide code cell source
import pandas as pd
from source.pacmap_functions import *

input_path = '../Data/Intermediate_Files/'
output_path = '../Data/Processed_Data/'

df_methyl = pd.read_pickle(
    input_path+'df_batch_corrected.pkl').sort_index()

df_labels = pd.read_csv(
    input_path+'clinical_data.csv', index_col=0, low_memory=False).sort_index()

df_labels['PaCMAP Output'] = 'Patient Samples'
df_labels['Batch'] = df_methyl['Batch']

print(
f' Dataset (df) contains {df_methyl.shape[1]}\
 columns (5mC nucleotides/probes) and {df_methyl.shape[0]} rows (samples).')

output_notebook()

# Set the theme for the plot
curdoc().theme = 'light_minimal' # or 'dark_minimal'
 Dataset (df) contains 333352 columns (5mC nucleotides/probes) and 3330 rows (samples).
Loading BokehJS ...

All samples#

Hide code cell source
clinical_trials = ['NOPHO ALL92-2000', 
                    'AAML0531',
                    'AAML1031',
                    'Beat AML Consortium',
                    'TCGA AML',
                    'CETLAM SMD-09 (MDS-tAML)',
                    'French GRAALL 2003–2005',
                    'TARGET ALL',
                    'AAML03P1',
                    'Japanese AML05',
                    'CCG2961']

sample_types = ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow',
                'Bone Marrow Normal','Primary Blood Derived Cancer - Peripheral Blood',
                'Blood Derived Normal','Likely Diagnosis', 'Control (Healthy Donor)',
                'Relapse','Recurrent Blood Derived Cancer - Bone Marrow',
                'Recurrent Blood Derived Cancer - Peripheral Blood',
                'Peripheral Blood Normal']

cols = ['PaCMAP Output','Pathology Class','WHO 2021 Diagnosis','WHO AML 2021 Diagnosis','WHO ALL 2021 Diagnosis','ELN AML 2022 Diagnosis',
        'Age (group years)', 'Batch', 'Sex', 'Clinical Trial', 'Sample Type', 'Karyotype', 'Gene Fusion', 'Patient_ID']

# processor = DataProcessor(df_labels.copy(), df_methyl, clinical_trials, sample_types, cols)
# processor.filter_data()
# processor.apply_pacmap()
# processor.join_labels()
# df = processor.df

# # Save output to avoid re-running the code multiple times
# df.to_csv(output_path+'pacmap_output/pacmap_2d_output_acute_leukemia.csv')

df = pd.read_csv(output_path+'pacmap_output/pacmap_2d_output_acute_leukemia.csv', index_col=0)

plotter = BokehPlotter(df, cols, get_custom_color_palette(), 
                       title='The Methylome Atlas of Acute Leukemia',
                       x_range=(-50, 50), y_range=(-50, 50),
                       datapoint_size=3)
plotter.plot()

Benchmark Classifiers#

Prepare data#

Hide code cell source
# exclude the samples with mixed phenotypes and Down syndrome and t(9;22)(q34.1;q11.2)/BCR::ABL1
df3 = df[~df['ELN AML 2022 Diagnosis'].isin(['Mixed phenotype acute leukemia T/myeloid',
                                       'Myeloid leukaemia associated with Down syndrome',
                                       'AML with t(9;22)(q34.1;q11.2)/BCR::ABL1'])]

# drop the samples with missing labels for the ELN AML 2022 Diagnosis
df3 = df3[~df3['ELN AML 2022 Diagnosis'].isna()]

# Define X and y
X = df3[['PaCMAP 1', 'PaCMAP 2']].to_numpy() # shape (n_samples=1399, n_features=2)
y = df3['ELN AML 2022 Diagnosis'].to_numpy() # shape (n_samples=1399,) with 11 string classes

from sklearn.gaussian_process import GaussianProcessClassifier

# Fit/predict
gpc = GaussianProcessClassifier(multi_class='one_vs_rest', random_state=42, n_jobs=-1)
_ = gpc.fit(X, y)
y_pred = gpc.predict(X)
y_pred_proba = gpc.predict_proba(X)
Hide code cell source
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

# drop the samples with missing labels for the ELN AML 2022 Diagnosis
df3 = df3[~df3['ELN AML 2022 Diagnosis'].isna()]

# Define X and y
X = df3[['PaCMAP 1', 'PaCMAP 2']].to_numpy() # shape (n_samples=1399, n_features=2)
y = df3['ELN AML 2022 Diagnosis'].to_numpy() # shape (n_samples=1399,) with 11 string classes

# Initialize Gaussian Process Classifier
gpc = GaussianProcessClassifier(multi_class='one_vs_rest', random_state=42, n_jobs=-1)

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Store predictions and probabilities
y_pred = np.empty_like(y, dtype=y.dtype)  # this will hold the predictions
y_pred_proba = np.zeros((len(y), len(np.unique(y))))  # this will hold the probabilities

# Perform cross-validation
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    gpc.fit(X_train, y_train)
    y_pred[test_index] = gpc.predict(X_test)
    y_pred_proba[test_index] = gpc.predict_proba(X_test)

# Show average accuracy score
accuracy = accuracy_score(y, y_pred)
print(f"Cross-validated Accuracy: {accuracy}")
Cross-validated Accuracy: 0.7805575411007862

Confusion matrix#

Hide code cell source
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_confusion_matrix(clf, x_test, y_test):
    sns.set_theme(style='white')
    predictions = clf.predict(x_test)
    cm = confusion_matrix(y_test, predictions, labels=clf.classes_, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=clf.classes_)
    disp.plot(cmap='Blues', values_format='.2f', xticks_rotation='vertical')

    # Increase the size of the plot
    fig = plt.gcf()
    fig.set_size_inches(10, 10)

    # Add title and axis names
    plt.title('Classification Results, n=' + str(len(x_test)), fontsize=14, fontweight='bold')
    plt.xlabel('Predicted', fontsize=14, fontweight='bold')
    plt.ylabel('Actual', fontsize=14, fontweight='bold')

    plt.show()

plot_confusion_matrix(gpc, X, y)
_images/4fdc8ece61cfa9def4308a1b9f34d21dbed1d853d6188a7d223f4897b3fc6b1b.png

ROC curves#

Hide code cell source
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score
import numpy as np

# Binarize the output
lb = LabelBinarizer()
y_bin = lb.fit_transform(df3['ELN AML 2022 Diagnosis'])
y_bin = np.array(y_bin)  # Ensure y_bin is a numpy array
n_classes = y_bin.shape[1]

# Convert y_pred_proba to numpy array
y_pred_proba = np.array(y_pred_proba)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_pred_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot all ROC curves
plt.figure(figsize=(20,15))
for i in range(n_classes):
    plt.subplot(4, 3, i+1)
    plt.plot(fpr[i], tpr[i], color='b', lw=2, label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('%s' % lb.classes_[i])  # Show class names
    plt.legend(loc="lower right")

# Plot micro-average ROC curve on the 12th panel
plt.subplot(4, 3, 12)
plt.plot(fpr["micro"], tpr["micro"], color='b', lw=2, label='micro-average ROC curve (area = %0.2f)' % roc_auc["micro"])
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Micro-average ROC curve')  # Title for micro-average
plt.legend(loc="lower right")

plt.tight_layout()  # Adjust spaces between the plots
plt.show()
_images/88986a570bb00c9adba62b1d6ff074350c6b55a28f397c4fc6274ba1f4a679e5.png

Apendix 1. Pediatric diagnostic AML samples#

Hide code cell source
clinical_trials = ['AAML0531', 'AAML1031', 'AAML03P1', 'CCG2961', 'Japanese AML05']

sample_types = ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow', 'Bone Marrow Normal',
                'Primary Blood Derived Cancer - Peripheral Blood', 'Blood Derived Normal']

cols = ['PaCMAP Output','Pathology Class','WHO AML 2021 Diagnosis','ELN AML 2022 Diagnosis', 'FAB', 'FLT3 ITD', 'Age (group years)',
        'Complex Karyotype', 'Primary Cytogenetic Code' ,'Batch', 'Sex', 'MRD 1 Status',
        'Leucocyte counts (10⁹/L)', 'Risk Group', 'Race or ethnic group',
        'Clinical Trial', 'Vital Status','First Event','Sample Type','Karyotype', 'Gene Fusion', 'Patient_ID']

# processor = DataProcessor(df_labels.copy(), df_methyl, clinical_trials, sample_types, cols)
# processor.filter_data()
# processor.apply_pacmap()
# processor.join_labels()
# df2 = processor.df

# # Save output to avoid re-running the code multiple times
# df2.to_csv(output_path+'pacmap_output/pacmap_2d_output_peds_dx_aml.csv')

df2 = pd.read_csv(output_path+'pacmap_output/pacmap_2d_output_peds_dx_aml.csv', index_col=0)

plotter = BokehPlotter(df2, cols, get_custom_color_palette(),
                       title='Map of Pediatric AML at Diagnosis',
                        x_range=(-45, 45), y_range=(-45, 45),
                        datapoint_size=4)
plotter.plot()

Apendix 2. KMT2A diagnostic pediatric AML samples#

Hide code cell source
clinical_trials = ['AAML0531', 'AAML1031', 'AAML03P1', 'CCG2961', 'Japanese AML05']

sample_types = ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow', 'Bone Marrow Normal',
                'Primary Blood Derived Cancer - Peripheral Blood', 'Blood Derived Normal']

cols = ['PaCMAP Output','Gene Fusion','WHO AML 2021 Diagnosis','ELN AML 2022 Diagnosis', 'FAB', 'FLT3 ITD', 'Age (group years)',
        'Complex Karyotype', 'Primary Cytogenetic Code' ,'Batch', 'Sex', 'MRD 1 Status', 
        'Leucocyte counts (10⁹/L)', 'Risk Group', 'Race or ethnic group','First Event',
        'Clinical Trial', 'Vital Status', 'Sample Type', 'Karyotype', 'Patient_ID']

kmt2a = df_labels[df_labels['ELN AML 2022 Diagnosis'].isin(['AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement'])]

# processor = DataProcessor(kmt2a, df_methyl, clinical_trials, sample_types, cols, remove_duplicates=False)
# processor.filter_data()
# processor.apply_pacmap()
# processor.join_labels()
# df3 = processor.df

# # Save output to avoid re-running the code multiple times
# df3.to_csv(output_path+'pacmap_output/pacmap_2d_output_kmt2a.csv')

df3 = pd.read_csv(output_path+'pacmap_output/pacmap_2d_output_kmt2a.csv', index_col=0)

plotter = BokehPlotter(df3, cols, get_custom_color_palette(),
                       title='KMT2A (MLL) Pediatric AML Samples',
                       x_range=(-40, 40), y_range=(-40, 40),
                       datapoint_size=9)
plotter.plot()

Watermark#

Author: Francisco_Marchi@Lamba_Lab_UF

Python implementation: CPython
Python version       : 3.8.16
IPython version      : 8.12.2

numpy  : 1.24.3
pandas : 2.0.2
bokeh  : 3.1.1
pacmap : 0.7.0
itables: 1.5.2

Compiler    : GCC 11.3.0
OS          : Linux
Release     : 5.15.90.1-microsoft-standard-WSL2
Machine     : x86_64
Processor   : x86_64
CPU cores   : 20
Architecture: 64bit